# Load the Lyon housing transactions dataset (one row per sale) and take a
# first look at it interactively and via a column-wise summary.
data <- read.csv("lyon_housing.csv")
View(data)
# summary(): quartiles for numeric columns, class info for character columns,
# and NA counts. Note the 14491 NAs in surface_effective_usable and the 143
# NAs in latitude/longitude (knitr output preserved below).
summary(data)
## date_transaction type_purchase type_property rooms_count
## Length:40516 Length:40516 Length:40516 Min. :1.000
## Class :character Class :character Class :character 1st Qu.:2.000
## Mode :character Mode :character Mode :character Median :3.000
## Mean :2.792
## 3rd Qu.:4.000
## Max. :6.000
##
## surface_housing surface_effective_usable surface_terrain parkings_count
## Min. : 20.00 Min. : 2.81 Min. : 0.000 Min. :0.0000
## 1st Qu.: 45.00 1st Qu.: 44.70 1st Qu.: 0.000 1st Qu.:0.0000
## Median : 63.00 Median : 62.80 Median : 0.000 Median :1.0000
## Mean : 65.24 Mean : 63.53 Mean : 3.765 Mean :0.5993
## 3rd Qu.: 80.00 3rd Qu.: 78.11 3rd Qu.: 0.000 3rd Qu.:1.0000
## Max. :300.00 Max. :270.67 Max. :944.000 Max. :3.0000
## NA's :14491
## price address district latitude
## Min. : 24000 Length:40516 Length:40516 Min. :45.72
## 1st Qu.: 155272 Class :character Class :character 1st Qu.:45.75
## Median : 220000 Mode :character Mode :character Median :45.76
## Mean : 255759 Mean :45.76
## 3rd Qu.: 309000 3rd Qu.:45.77
## Max. :2780000 Max. :45.81
## NA's :143
## longitude date_construction
## Min. :4.773 Length:40516
## 1st Qu.:4.831 Class :character
## Median :4.854 Mode :character
## Mean :4.852
## 3rd Qu.:4.876
## Max. :4.920
## NA's :143
# Count missing values: total across the data frame, then per column.
print(paste("total NA cells of: ", sum(is.na(data))))
## [1] "total NA cells of: 14777"
colSums(is.na(data))
## date_transaction type_purchase type_property
## 0 0 0
## rooms_count surface_housing surface_effective_usable
## 0 0 14491
## surface_terrain parkings_count price
## 0 0 0
## address district latitude
## 0 0 143
## longitude date_construction
## 143 0
# FIX: is.null() is NOT vectorized -- sum(is.null(data)) always evaluates to
# sum(FALSE) == 0 and proves nothing. Check each column instead (still 0 for
# a well-formed data.frame, but the check is now meaningful).
print(paste("total null cells of: ", sum(vapply(data, is.null, logical(1)))))
## [1] "total null cells of: 0"
# Check whether the 143 missing latitudes and 143 missing longitudes occur on
# exactly the same rows.
# FIX: `if` needs a scalar condition; comparing the two which() vectors with
# `==` is elementwise, and only the first element was used (hence the warning
# in the original knitr output). identical() compares the full index vectors
# at once, including their lengths.
if (identical(which(is.na(data$latitude)), which(is.na(data$longitude)))) {
print("they belong to same rows")
} else {
print("they don't belong to same rows")
}
## [1] "they belong to same rows"
# Impute missing usable surface with the housing surface (the two columns are
# strongly correlated, r ~ 0.98 as computed later), then drop the remaining
# incomplete rows (the 143 sales with missing coordinates).
# FIX: use <- (not =) for assignment; logical-index assignment replaces
# ifelse(), which scans and copies the whole column and strips attributes.
missing_usable <- is.na(data$surface_effective_usable)
data$surface_effective_usable[missing_usable] <- data$surface_housing[missing_usable]
data <- data[complete.cases(data), ]
print(paste("total NA cells of: ", sum(is.na(data))))
## [1] "total NA cells of: 0"
# Column classes after cleaning: the date columns are still "yy/mm/dd"
# character strings and need conversion before any date arithmetic.
sapply(data, class)
## date_transaction type_purchase type_property
## "character" "character" "character"
## rooms_count surface_housing surface_effective_usable
## "integer" "integer" "numeric"
## surface_terrain parkings_count price
## "integer" "integer" "numeric"
## address district latitude
## "character" "character" "numeric"
## longitude date_construction
## "numeric" "character"
# Min/max of every column. For character columns min/max are lexicographic,
# which is why e.g. address shows "99 RUE TETE D'OR" as a bound.
# NOTE(review): date_construction values carry a spurious " 11:38" time
# suffix -- the later as.Date() parse ignores everything past the date part.
data.frame(min=sapply(data,min),max=sapply(data,max))
## min max
## date_transaction 16/07/01 21/06/30
## type_purchase ancien VEFA
## type_property appartement maison
## rooms_count 1 6
## surface_housing 20 300
## surface_effective_usable 2.81 286
## surface_terrain 0 944
## parkings_count 0 3
## price 24000 2780000
## address 99 RUE TETE D'OR
## district Lyon 1er Arrondissement Villeurbanne
## latitude 45.722048 45.805458
## longitude 4.773162 4.920161
## date_construction 00/01/08 11:38 99/12/17 11:38
# Parse the two-digit-year date columns ("21/06/30" -> 2021-06-30); %y maps
# 00-68 to 20xx and 69-99 to 19xx, and any trailing text (the stray " 11:38"
# in date_construction) is ignored by the format.
data$date_construction <- as.Date(data$date_construction, format = "%y/%m/%d")
data$date_transaction <- as.Date(data$date_transaction, format = "%y/%m/%d")
# Keep a sale when it is an off-plan purchase ("VEFA") or its dates are
# plausible: not in the future, and not before the building existed.
data <- subset(data, type_purchase == "VEFA" | date_transaction <= Sys.Date())
data <- subset(data, type_purchase == "VEFA" | date_transaction >= date_construction)
# my_plots <- lapply(names(data), function(var_x){
# p <- ggplot(data) + aes(.data[[var_x]]) # note: aes_string() is deprecated in ggplot2 3.x
#
# if(is.numeric(data[[var_x]])) {
# p <- p + geom_density()
#
# } else {
# p <- p + geom_bar()
# }
#
# })
# plot_grid(plotlist = my_plots)
# Distribution of the two categorical sale attributes.
barplot(table(data$type_purchase), col = rgb(0.2, 0.2, 0.3))
barplot(table(data$type_property), col = rgb(0.8, 0.3, 0.8))
#### For the room count, 3 bedrooms is the most popular one. Next are the houses with 2, 4, 1, 5 and 6 bedrooms respectively.
hist(data$rooms_count, col = rgb(0.4, 0.1, 0.4))
#### Also looking at surface_housing and surface_effective_useable, they almost have the same distribution. Let’s compute their correlation:
cor(data$surface_housing, data$surface_effective_usable)
## [1] 0.9765241
# The two surface columns are nearly collinear (r ~ 0.98): keep only one.
data <- data %>% select(-surface_effective_usable)
hist(data$surface_terrain, col = rgb(0.7, 0.7, 0.4))
#### Almost all of its values are zero. Let’s see the percentage of zeros:
# FIX: use <- (not =) for assignment.
percentage <- colSums(data == 0) / nrow(data) * 100
percentage
## date_transaction type_purchase type_property rooms_count
## 0.00000 0.00000 0.00000 0.00000
## surface_housing surface_terrain parkings_count price
## 0.00000 98.66326 45.62181 0.00000
## address district latitude longitude
## 0.00000 0.00000 0.00000 0.00000
## date_construction
## 0.00000
# surface_terrain is ~98.7% zeros -- drop it as uninformative.
data <- subset(data, select = -surface_terrain)
hist(data$parkings_count, col = rgb(0.5, 0.2, 0.1))
#### Another important factor is house price:
hist(data$price, col = rgb(0.1, 0.5, 0.1))
#### Most of the houses have a price around 250000.
barplot(table(data$district), col = rgb(0.2, 0.3, 0.2))
#### As you can see, the last district contains almost more than twice the number of houses compared to other districts.
# Age of the property at sale time (a difftime in days).
data$property_age <- data$date_transaction - data$date_construction
# FIX: select the numeric columns by NAME, not by position -- positional
# indices like c(4, 5, 6, 7) silently point at the wrong columns whenever a
# column is added or removed upstream (two were just dropped).
numeric_cols <- c("rooms_count", "surface_housing", "parkings_count", "price")
res <- rcorr(as.matrix(data[, numeric_cols]))
res
## rooms_count surface_housing parkings_count price
## rooms_count 1.00 0.84 0.27 0.56
## surface_housing 0.84 1.00 0.22 0.75
## parkings_count 0.27 0.22 1.00 0.19
## price 0.56 0.75 0.19 1.00
##
## n= 39948
##
##
## P
## rooms_count surface_housing parkings_count price
## rooms_count 0 0 0
## surface_housing 0 0 0
## parkings_count 0 0 0
## price 0 0 0
# Visualise the upper triangle of the correlation matrix.
corrplot(res$r, type = "upper", order = "hclust",
         sig.level = 0.01, insig = "blank")
#### we can also plot a chart for their correlation
my_data <- data[, numeric_cols]
chart.Correlation(my_data, histogram = TRUE, pch = 19)
### Use the data from subway stations file
# Load metro/tram station coordinates: one entry per transit line, each with
# parallel vectors $stations, $latitudes, $longitudes.
dfs <- fromJSON('station_coordinates.json')
dfA <- dfs$A
dfB <- dfs$B
# FIX: dfC previously read dfs$D (copy-paste error), so line C's stations
# were silently missing and line D's were counted twice.
dfC <- dfs$C
dfD <- dfs$D
dfT1 <- dfs$T1
dfT2 <- dfs$T2
dfT3 <- dfs$T3
dfT4 <- dfs$T4
dfT5 <- dfs$T5
dfT6 <- dfs$T6
dfT7 <- dfs$T7
stations_list <- list(dfA, dfB, dfC, dfD, dfT1, dfT2, dfT3, dfT4, dfT5, dfT6, dfT7)
# Flatten names and coordinates across all lines. unlist(lapply()) replaces
# the original loop that grew the vectors with c() on every iteration.
stations <- unlist(lapply(stations_list, function(s) s$stations))
lat <- unlist(lapply(stations_list, function(s) s$latitudes))
long <- unlist(lapply(stations_list, function(s) s$longitudes))
# library(geosphere)
# nearest_station = c()
# station_distance = c()
# for (j in (1: nrow(data))){
# stas = c()
# for (i in (1:length(stations))){
# distance = distm(c(long[i], lat[i]), c(data[['longitude']][j], data[['latitude']][j]), fun = distHaversine) # fixed: the house row index is j, not i
# stas = c(stas, distance)
# }
# station_distance = c(station_distance, min(stas))
# nearest_station = c(nearest_station, which.min(stas))
# }
# data['station_distance'] <- station_distance
# data['nearest_station'] <- nearest_station
# Density of the full price column (right-skewed, as expected for prices).
plot(density(data$price))
#### To check if it has a normal distribution, we can perform the shapiro test on it. Take the null hypothesis to be that this distribution is normal. Hence, if test is significant, it is not normal. First, take a sample of size 50 and perform the test on them.
# Seed fixed so the 50-row sample (and hence the test output) is reproducible.
set.seed(1000)
random_sample <- dplyr::sample_n(data, 50)
plot(density(random_sample$price))
# p = 0.231 > 0.05: normality is not rejected for this small sample.
shapiro.test(random_sample$price)
##
## Shapiro-Wilk normality test
##
## data: random_sample$price
## W = 0.96998, p-value = 0.231
plot(density(data$surface_housing))
#### Similarly, we perform the shapiro test on it. Take the null hypothesis to be that this distribution is normal.
# Same seed: the sampled rows match the price test above.
set.seed(1000)
random_sample <- dplyr::sample_n(data, 50)
plot(density(random_sample$surface_housing))
# p = 0.1542 > 0.05: normality is not rejected here either.
shapiro.test(random_sample$surface_housing)
##
## Shapiro-Wilk normality test
##
## data: random_sample$surface_housing
## W = 0.96572, p-value = 0.1542
# Simple linear regression: price explained by housing surface
# (R^2 ~ 0.56, slope ~ 4097 EUR per additional square meter).
# FIX: use <- (not =) for assignment.
reg <- lm(price ~ surface_housing, data)
summary(reg)
##
## Call:
## lm(formula = price ~ surface_housing, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -854382 -57289 -1824 49988 2463407
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -11180.11 1292.88 -8.647 <2e-16 ***
## surface_housing 4097.16 18.18 225.325 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 102700 on 39946 degrees of freedom
## Multiple R-squared: 0.5597, Adjusted R-squared: 0.5597
## F-statistic: 5.077e+04 on 1 and 39946 DF, p-value: < 2.2e-16
# Scatter plot with fitted line, confidence band and Pearson r annotation.
ggscatter(data, x = "surface_housing", y = "price",
          add = "reg.line", conf.int = TRUE,
          cor.coef = TRUE, cor.method = "pearson",
          xlab = "surface housing", ylab = "price")
## `geom_smooth()` using formula 'y ~ x'
# Map of all transactions, coloured by district.
ggplot() + geom_point(data = data, aes(y = latitude, x = longitude, color = as.factor(district)), size = 0.4)
#### now apply the price factor to this:
# Same map, now encoding price as a colour gradient and property type as shape.
ggplot() +
  geom_point(
    data = data,
    mapping = aes(x = longitude, y = latitude, color = price, shape = type_property),
    size = 1.5
  ) +
  scale_color_gradientn(colours = c("blue", "red")) +
  scale_shape_manual(values = c(4, 15))
#### It seems like there is not much of a significant difference between districts. But we can see that the average price in dist 1, 2, 4, 6 might be a bit higher. #### let’s also check this doing regression
# Regression on district alone: each coefficient is the mean price offset of
# that district relative to the baseline (Lyon 1er, absorbed in the intercept).
reg <- lm(price ~ district, data)
summary(reg)
##
## Call:
## lm(formula = price ~ district, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -365525 -87764 -25549 56276 2576346
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 291398 3409 85.482 < 2e-16 ***
## districtLyon 2e Arrondissement 71366 5144 13.874 < 2e-16 ***
## districtLyon 3e Arrondissement -17675 3893 -4.540 5.63e-06 ***
## districtLyon 4e Arrondissement 43336 4589 9.444 < 2e-16 ***
## districtLyon 5e Arrondissement -40930 4350 -9.409 < 2e-16 ***
## districtLyon 6e Arrondissement 104126 4495 23.163 < 2e-16 ***
## districtLyon 7e Arrondissement -42474 3957 -10.734 < 2e-16 ***
## districtLyon 8e Arrondissement -62641 4030 -15.543 < 2e-16 ***
## districtLyon 9e Arrondissement -71849 4244 -16.929 < 2e-16 ***
## districtVilleurbanne -87745 3698 -23.725 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 144900 on 39938 degrees of freedom
## Multiple R-squared: 0.1231, Adjusted R-squared: 0.1229
## F-statistic: 622.8 on 9 and 39938 DF, p-value: < 2.2e-16
# NOTE(review): district is categorical, so a Pearson correlation coefficient
# on this scatter is not meaningful -- the plot is kept for visual comparison
# of per-district price spreads only.
ggscatter(data, x = "district", y = "price",
add = "reg.line", conf.int = TRUE,
cor.coef = TRUE, cor.method = "pearson",
xlab = "district", ylab = "price")
## `geom_smooth()` using formula 'y ~ x'
#### here we can also see that R-squared is small (about 0.12), so district alone explains only a small part of the price variation.
# Regression of price on room count (R^2 ~ 0.31, ~73k EUR per extra room).
# FIX: use <- (not =) for assignment.
reg <- lm(price ~ rooms_count, data)
summary(reg)
##
## Call:
## lm(formula = price ~ rooms_count, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -378427 -68872 -12202 46593 2435583
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 52396.6 1649.2 31.77 <2e-16 ***
## rooms_count 73005.0 544.2 134.16 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 128500 on 39946 degrees of freedom
## Multiple R-squared: 0.3106, Adjusted R-squared: 0.3106
## F-statistic: 1.8e+04 on 1 and 39946 DF, p-value: < 2.2e-16
# Scatter with fitted line; rooms_count is discrete, so points stack in columns.
ggscatter(data, x = "rooms_count", y = "price",
          add = "reg.line", conf.int = TRUE,
          cor.coef = TRUE, cor.method = "pearson",
          xlab = "rooms count", ylab = "price")
## `geom_smooth()` using formula 'y ~ x'
# Circle a fixed reference point on the district map.
# FIX: use <- (not =) for assignment throughout this section.
lat <- 45.780234113880425
l <- 4.865561717882041
ggplot() +
  geom_point(data = data, aes(y = latitude, x = longitude, color = as.factor(district)), size = 0.4) +
  geom_circle(aes(x0 = l, y0 = lat, r = 0.01))
# Candidate houses: Villeurbanne or the 6e, more than 2 rooms, >= 1 parking.
houses <- subset(data, (district == "Villeurbanne" | district == "Lyon 6e Arrondissement") &
                   rooms_count > 2 & parkings_count > 0)
# Score each candidate: reward normalized area, penalize normalized price.
normal_prices <- houses$price / mean(houses$price)
normal_area <- houses$surface_housing / mean(houses$surface_housing)
score <- 10 * normal_area - 5 * normal_prices
# FIX: which.max() yields a single index and avoids the floating-point
# `score == max(score)` comparison (which could also return several indices
# on ties, making the coordinate lookups vectors); computed once, not twice.
i <- which.max(score)
print(paste(houses$longitude[i], houses$latitude[i]))
## [1] "4.880895 45.764189"
# NOTE: `long` and `lat` overwrite the station coordinate vectors built
# earlier -- kept as-is for compatibility with the rest of the script.
long <- houses$longitude[i]
lat <- houses$latitude[i]
ggplot() +
  geom_point(data = data, aes(y = latitude, x = longitude, color = as.factor(district)), size = 0.4) +
  geom_circle(aes(x0 = long, y0 = lat, r = 0.005))